Prediction model



In [1]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve









    



/Users/Home/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Import data

Might still need to clean up the files some after import



In [2]:

    
path = '../Clean Data'
X_fn = 'X.csv'
y_fn = 'y.csv'
X_path = os.path.join(path, X_fn)
y_path = os.path.join(path, y_fn)

X = pd.read_csv(X_path)
y = pd.read_csv(y_path)



In [3]:

    
X.head()









    Out[3]:






  
    
      
      Unnamed: 0
      cluster_id_6
      Year
      nameplate_capacity
      DATETIME
      GROSS LOAD (MW)
      ERCOT Load, MW
      Total Wind Installed, MW
      Total Wind Output, MW
      Wind Output, % of Installed
      Wind Output, % of Load
      1-hr MW change
      1-hr % change
      Net Load (MW)
      Net Load Change (MW)
      Month
      NG Price ($/mcf)
      All coal
      Lignite
      Subbituminous
    
  
  
    
      0
      0
      0
      2007
      5949.0
      2007-01-01 00:00:00
      4596.0
      30428.0
      2790.0
      1074.0
      38.494624
      3.529644
      NaN
      NaN
      29354.0
      NaN
      1
      6.42
      25.1475
      20.0275
      28.115
    
    
      1
      1
      0
      2007
      5949.0
      2007-01-01 01:00:00
      4566.0
      30133.0
      2790.0
      922.6
      33.068100
      3.061760
      -151.4
      -14.096834
      29210.4
      -143.6
      1
      6.42
      25.1475
      20.0275
      28.115
    
    
      2
      2
      0
      2007
      5949.0
      2007-01-01 02:00:00
      4667.0
      29941.0
      2790.0
      849.2
      30.437276
      2.836245
      -73.4
      -7.955777
      29091.8
      -118.6
      1
      6.42
      25.1475
      20.0275
      28.115
    
    
      3
      3
      0
      2007
      5949.0
      2007-01-01 03:00:00
      4668.0
      29949.0
      2790.0
      1056.3
      37.860215
      3.526996
      207.1
      24.387659
      28892.7
      -199.1
      1
      6.42
      25.1475
      20.0275
      28.115
    
    
      4
      4
      0
      2007
      5949.0
      2007-01-01 04:00:00
      4685.0
      30248.0
      2790.0
      837.1
      30.003584
      2.767456
      -219.2
      -20.751680
      29410.9
      518.2
      1
      6.42
      25.1475
      20.0275
      28.115

Rename the cluster column to just cluster. This won't be needed once we export from the group classification with the correct column name



In [3]:

    
X.rename(columns={'cluster_id_6':'cluster'}, inplace=True)

Ratio of prices

Divide coal prices by natural gas price, drop natural gas price



In [4]:

    
for fuel in ['All coal', 'Lignite', 'Subbituminous']:
    X.loc[:,fuel] = X.loc[:,fuel].values/X.loc[:,'NG Price ($/mcf)'].values
    
X.drop('NG Price ($/mcf)', axis=1, inplace=True)

One-hot encoding of the cluster variable

I'm trying to make this easy for using with different numbers of clusters



In [5]:

    
cluster_ids = X['cluster'].unique()
for cluster in cluster_ids:
    X['cluster_{}'.format(cluster)] = np.eye(len(cluster_ids))[X['cluster'],cluster]



In [6]:

    
X.head()









    Out[6]:






  
    
      
      Unnamed: 0
      cluster
      Year
      nameplate_capacity
      DATETIME
      GROSS LOAD (MW)
      ERCOT Load, MW
      Total Wind Installed, MW
      Total Wind Output, MW
      Wind Output, % of Installed
      ...
      Month
      All coal
      Lignite
      Subbituminous
      cluster_0
      cluster_1
      cluster_2
      cluster_3
      cluster_4
      cluster_5
    
  
  
    
      0
      0
      0
      2007
      5949.0
      2007-01-01 00:00:00
      4596.0
      30428.0
      2790.0
      1074.0
      38.494624
      ...
      1
      3.917056
      3.119548
      4.379283
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1
      1
      0
      2007
      5949.0
      2007-01-01 01:00:00
      4566.0
      30133.0
      2790.0
      922.6
      33.068100
      ...
      1
      3.917056
      3.119548
      4.379283
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      2
      0
      2007
      5949.0
      2007-01-01 02:00:00
      4667.0
      29941.0
      2790.0
      849.2
      30.437276
      ...
      1
      3.917056
      3.119548
      4.379283
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      3
      3
      0
      2007
      5949.0
      2007-01-01 03:00:00
      4668.0
      29949.0
      2790.0
      1056.3
      37.860215
      ...
      1
      3.917056
      3.119548
      4.379283
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      4
      4
      0
      2007
      5949.0
      2007-01-01 04:00:00
      4685.0
      30248.0
      2790.0
      837.1
      30.003584
      ...
      1
      3.917056
      3.119548
      4.379283
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

5 rows × 25 columns



In [8]:

    
X.tail()









    Out[8]:






  
    
      
      Unnamed: 0
      cluster
      Year
      nameplate_capacity
      DATETIME
      GROSS LOAD (MW)
      ERCOT Load, MW
      Total Wind Installed, MW
      Total Wind Output, MW
      Wind Output, % of Installed
      ...
      Month
      All coal
      Lignite
      Subbituminous
      cluster_0
      cluster_1
      cluster_2
      cluster_3
      cluster_4
      cluster_5
    
  
  
    
      473329
      473329
      5
      2015
      11476.0
      2015-12-31 19:00:00
      7516.0
      39908.77734
      16170.0
      3824.932373
      23.654498
      ...
      12
      12.793722
      10.780269
      14.596413
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      473330
      473330
      5
      2015
      11476.0
      2015-12-31 20:00:00
      6552.0
      38736.85938
      16170.0
      4625.632813
      28.606264
      ...
      12
      12.793722
      10.780269
      14.596413
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      473331
      473331
      5
      2015
      11476.0
      2015-12-31 21:00:00
      5944.0
      37587.70313
      16170.0
      4957.714844
      30.659956
      ...
      12
      12.793722
      10.780269
      14.596413
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      473332
      473332
      5
      2015
      11476.0
      2015-12-31 22:00:00
      5698.0
      36356.26172
      16170.0
      4699.097656
      29.060592
      ...
      12
      12.793722
      10.780269
      14.596413
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      473333
      473333
      5
      2015
      11476.0
      2015-12-31 23:00:00
      5365.0
      35150.33984
      16170.0
      4313.125000
      26.673624
      ...
      12
      12.793722
      10.780269
      14.596413
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
  

5 rows × 25 columns



In [9]:

    
y.tail()









    Out[9]:






  
    
      
      Unnamed: 0
      DATETIME
      cluster_id_6
      Gen Change (MW)
    
  
  
    
      473329
      473329
      2015-12-31 19:00:00
      5
      -20.0
    
    
      473330
      473330
      2015-12-31 20:00:00
      5
      -964.0
    
    
      473331
      473331
      2015-12-31 21:00:00
      5
      -608.0
    
    
      473332
      473332
      2015-12-31 22:00:00
      5
      -246.0
    
    
      473333
      473333
      2015-12-31 23:00:00
      5
      -333.0

Drop unnecessary columns and replace nan's with 0



In [7]:

    
X_cols = ['nameplate_capacity', 'GROSS LOAD (MW)', 'ERCOT Load, MW',
          'Total Wind Installed, MW', 'Total Wind Output, MW', 'Net Load Change (MW)',
          'All coal', 'Lignite', 'Subbituminous']
X_cluster_cols = ['cluster_{}'.format(cluster) for cluster in cluster_ids]

X_clean = X.loc[:,X_cols+X_cluster_cols]
X_clean.fillna(0, inplace=True)

y_clean = y.loc[:,'Gen Change (MW)']
y_clean.fillna(0, inplace=True)



In [11]:

    
print X_clean.shape
print y_clean.shape









    



(473334, 15)
(473334,)

Split into training, validation, testing



In [8]:

    
X_train = X_clean.loc[(X['Year']<2012),:]
y_train = y_clean.loc[(X['Year']<2012)]

X_va = X_clean.loc[X['Year'].isin([2012, 2013]),:]
y_va = y_clean.loc[X['Year'].isin([2012, 2013])]

X_test = X_clean.loc[X['Year']>2013,:]
y_test = y_clean.loc[X['Year']>2013]

Somehow we're missing 2 records from X_va



In [13]:

    
print X_va.shape, y_va.shape









    



(105264, 15) (105264,)

Need scaled versions of the X data for some of the models



In [9]:

    
X_train_scaled = StandardScaler().fit_transform(X_train)
X_va_scaled = StandardScaler().fit_transform(X_va)
X_test_scaled = StandardScaler().fit_transform(X_test)

Check size of all arrays



In [15]:

    
print X_train_scaled.shape, y_train.shape
print X_va_scaled.shape, y_va.shape
print X_test_scaled.shape, y_test.shape









    



(262944L, 16L) (262944L,)
(105264L, 16L) (105264L,)
(105126L, 16L) (105126L,)

Linear Regression (OLS)



In [10]:

    
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)









    Out[10]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [11]:

    
lm.score(X_va_scaled, y_va)









    Out[11]:





0.26678565336933879



In [12]:

    
y_pr = lm.predict(X_va_scaled)



In [13]:

    
y_va.values.shape, y_pr.shape, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values.shape









    Out[13]:





((105264,), (105264,), (105264,))



In [14]:

    
y_lm_resids = pd.DataFrame(dict(zip(['Gen Change (MW)', 'y_pr', 'cluster'],
                               [y_va.values, y_pr, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values])))
# y_lm_resids['y_pr'] = y_pr
# y_lm_resids['cluster'] = X.loc[:,'cluster']



In [21]:

    
y_lm_resids.head()









    Out[21]:






  
    
      
      Gen Change (MW)
      cluster
      y_pr
    
  
  
    
      0
      0.0
      0
      -56.223127
    
    
      1
      1.0
      0
      -15.262189
    
    
      2
      -1.0
      0
      -12.223127
    
    
      3
      0.0
      0
      -9.316877
    
    
      4
      0.0
      0
      25.909686



In [15]:

    
y_lm_resids.loc[:,'residuals'] = y_lm_resids.loc[:,'y_pr'] - y_lm_resids.loc[:,'Gen Change (MW)']



In [36]:

    
with sns.axes_style('whitegrid'):
    g = sns.FacetGrid(y_lm_resids, hue='cluster', col='cluster',
                      col_wrap=3)
    g.map(plt.scatter, 'y_pr', 'residuals', s=5, alpha=0.3)
    g.set_xlabels(size=15)
    g.set_ylabels(size=15)
    plt.savefig('OLS residuals.pdf')









    Out[36]:





<seaborn.axisgrid.FacetGrid at 0x12463e290>






    Out[36]:





<seaborn.axisgrid.FacetGrid at 0x12463e290>






    Out[36]:





<seaborn.axisgrid.FacetGrid at 0x12463e290>

LinearSVR

The rbf kernal of SVR is very slow with this large of a dataset. If we want to try using it, should probably subsample.

Shows use with GridSearchCV if you guys want to use it



In [20]:

    
svm = LinearSVR()



In [25]:

    
parameters = {'C':np.logspace(-5, 3, num=15)}



In [26]:

    
lm = GridSearchCV(svm, parameters, n_jobs=-1, verbose=3)

Run the LinearSVR with gridsearch over the 15 parameter values. GridSearchCV does 3-fold CV by default.



In [27]:

    
results = lm.fit(X_train_scaled, y_train)









    



Fitting 3 folds for each of 15 candidates, totalling 45 fits






    



[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.0min finished

Validation Curve



In [25]:

    
param_range = np.logspace(-5, 3, num=13)
train_scores, valid_scores = validation_curve(LinearSVR(), X_train_scaled, y_train, 
                                              "C", param_range, n_jobs=-1)



In [26]:

    
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)


plt.title("Validation Curve - LinearSVR", size=15)
plt.xlabel("C", size=15)
plt.ylabel("Score", size=15)
plt.ylim(0.0, .2)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, valid_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.savefig('LinearSVR validation curve.pdf', bbox_inches='tight')









    Out[26]:





<matplotlib.text.Text at 0x11af1c910>






    Out[26]:





<matplotlib.text.Text at 0x1244beb10>






    Out[26]:





<matplotlib.text.Text at 0x1194b3690>






    Out[26]:





(0.0, 0.2)






    Out[26]:





[<matplotlib.lines.Line2D at 0x11b9d3350>]






    Out[26]:





<matplotlib.collections.PolyCollection at 0x11b9d3310>






    Out[26]:





[<matplotlib.lines.Line2D at 0x1194a3cd0>]






    Out[26]:





<matplotlib.collections.PolyCollection at 0x117583a90>






    Out[26]:





<matplotlib.legend.Legend at 0x11758e250>

Learning Curve



In [28]:

    
# xLen = len(X_train_scaled)
# tSize = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
train_sizes, train_scores, valid_scores = learning_curve(LinearSVR(), X_train_scaled, 
                                                         y_train, n_jobs=-1)



In [33]:

    
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.title("Learning Curve - LinearSVR", size=15)
plt.xlabel("Training Size", size=15)
plt.ylabel("Score", size=15)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="darkorange")
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.1, color="navy")
plt.plot(train_sizes, train_scores_mean, 'o-', color="darkorange",
         label="Training score")
plt.plot(train_sizes, valid_scores_mean, 'o-', color="navy",
         label="Cross-validation score")

plt.legend(loc="best")
plt.savefig('LinearSVR learning curve.pdf', bbox_inches='tight')









    Out[33]:





<matplotlib.text.Text at 0x124a4be50>






    Out[33]:





<matplotlib.text.Text at 0x11b71e550>






    Out[33]:





<matplotlib.text.Text at 0x1241ca5d0>






    Out[33]:





<matplotlib.collections.PolyCollection at 0x11d682090>






    Out[33]:





<matplotlib.collections.PolyCollection at 0x11d682190>






    Out[33]:





[<matplotlib.lines.Line2D at 0x124b495d0>]






    Out[33]:





[<matplotlib.lines.Line2D at 0x124b49b90>]






    Out[33]:





<matplotlib.legend.Legend at 0x124b49d10>

cv_results_ returns a dictionary with all of the results and parameters



In [28]:

    
results.cv_results_









    Out[28]:





{'mean_fit_time': array([  0.29466669,   0.31133342,   0.53800003,   0.65266665,
          0.80899994,   0.78433331,   0.81199996,   0.87266668,
          0.9866666 ,   1.28399992,   2.15866669,   4.79399999,
         11.95500008,  25.98933331,  43.23566675]),
 'mean_score_time': array([ 0.03633332,  0.03933334,  0.08633327,  0.01066677,  0.0126667 ,
         0.00999999,  0.01366663,  0.01433341,  0.01833336,  0.01333332,
         0.01299993,  0.01333332,  0.01366663,  0.01099992,  0.00766659]),
 'mean_test_score': array([ 0.00287043,  0.00987067,  0.03003211,  0.07128326,  0.11854968,
         0.1501322 ,  0.1644035 ,  0.16960687,  0.17144693,  0.17192847,
         0.17160952,  0.17128199,  0.17042826,  0.17088725,  0.16581553]),
 'mean_train_score': array([ 0.00287357,  0.00988178,  0.03007964,  0.07148156,  0.11913756,
         0.1513134 ,  0.16617615,  0.17197534,  0.17427679,  0.17499071,
         0.17536349,  0.17557742,  0.17513131,  0.17544765,  0.17128382]),
 'param_C': masked_array(data = [1.0000000000000001e-05 3.7275937203149381e-05 0.00013894954943731373
  0.0005179474679231213 0.0019306977288832496 0.0071968567300115137
  0.026826957952797246 0.10000000000000001 0.37275937203149379
  1.3894954943731359 5.1794746792312019 19.306977288832496 71.96856730011514
  268.26957952797216 1000.0],
              mask = [False False False False False False False False False False False False
  False False False],
        fill_value = ?),
 'params': ({'C': 1.0000000000000001e-05},
  {'C': 3.7275937203149381e-05},
  {'C': 0.00013894954943731373},
  {'C': 0.0005179474679231213},
  {'C': 0.0019306977288832496},
  {'C': 0.0071968567300115137},
  {'C': 0.026826957952797246},
  {'C': 0.10000000000000001},
  {'C': 0.37275937203149379},
  {'C': 1.3894954943731359},
  {'C': 5.1794746792312019},
  {'C': 19.306977288832496},
  {'C': 71.96856730011514},
  {'C': 268.26957952797216},
  {'C': 1000.0}),
 'rank_test_score': array([15, 14, 13, 12, 11, 10,  9,  7,  3,  1,  2,  4,  6,  5,  8]),
 'split0_test_score': array([ 0.00301764,  0.01030828,  0.03116889,  0.07332238,  0.12036935,
         0.15154889,  0.16511589,  0.1696596 ,  0.17094722,  0.17212156,
         0.17179597,  0.17210909,  0.17093621,  0.1711447 ,  0.17189868]),
 'split0_train_score': array([ 0.00295224,  0.01009817,  0.03066308,  0.07290477,  0.12169576,
         0.15579254,  0.17192734,  0.17853343,  0.1809065 ,  0.18234366,
         0.18280627,  0.18269438,  0.18187619,  0.18167064,  0.1834368 ]),
 'split1_test_score': array([ 0.00296258,  0.01014405,  0.03052811,  0.07106492,  0.11597412,
         0.14511882,  0.15822912,  0.1627618 ,  0.16480322,  0.16443405,
         0.16456281,  0.16445884,  0.16459095,  0.16665469,  0.15697647]),
 'split1_train_score': array([ 0.00304868,  0.01044205,  0.0314778 ,  0.0736199 ,  0.12111131,
         0.15264044,  0.16698589,  0.17228259,  0.17506954,  0.17496312,
         0.1752573 ,  0.17530995,  0.17516148,  0.17714316,  0.16690628]),
 'split2_test_score': array([ 0.00263108,  0.00915966,  0.02839933,  0.06946248,  0.11930557,
         0.15372889,  0.16986548,  0.1763992 ,  0.17859036,  0.17922978,
         0.17846979,  0.17727803,  0.17575762,  0.17486237,  0.16857143]),
 'split2_train_score': array([ 0.00261979,  0.00910511,  0.02809803,  0.06792001,  0.11460562,
         0.14550722,  0.15961522,  0.16510999,  0.16685434,  0.16766536,
         0.1680269 ,  0.16872794,  0.16835626,  0.16752916,  0.1635084 ]),
 'std_fit_time': array([  1.24723493e-03,   1.64992388e-02,   1.03405361e-01,
          1.92238952e-02,   2.90631870e-02,   1.26376020e-01,
          5.12314671e-02,   7.84064085e-02,   1.09332276e-01,
          1.44589107e-01,   2.24553307e-01,   4.04309506e-02,
          2.89872308e-01,   5.01694043e-01,   1.52403465e+00]),
 'std_score_time': array([ 0.00402773,  0.0037713 ,  0.08738544,  0.00235708,  0.00169974,
         0.        ,  0.00309114,  0.00339932,  0.00825964,  0.00094274,
         0.00141417,  0.00169974,  0.00169955,  0.00163307,  0.00047137]),
 'std_test_score': array([ 0.00017074,  0.00050721,  0.00118382,  0.00158335,  0.00187226,
         0.00365501,  0.00477716,  0.00556757,  0.00563966,  0.00604188,
         0.00567903,  0.00526599,  0.0045729 ,  0.00335571,  0.00639606]),
 'std_train_score': array([ 0.00018371,  0.00056685,  0.00144014,  0.00253526,  0.00321344,
         0.00430254,  0.00505891,  0.0054844 ,  0.00576409,  0.00599242,
         0.00603412,  0.00570491,  0.00551953,  0.00589641,  0.00870469])}



In [29]:

    
test_score = results.cv_results_['mean_test_score']
train_score = results.cv_results_['mean_train_score']



In [30]:

    
C = [results.cv_results_['params'][x]['C'] for x in range(15)]



In [31]:

    
C









    Out[31]:





[1.0000000000000001e-05,
 3.7275937203149381e-05,
 0.00013894954943731373,
 0.0005179474679231213,
 0.0019306977288832496,
 0.0071968567300115137,
 0.026826957952797246,
 0.10000000000000001,
 0.37275937203149379,
 1.3894954943731359,
 5.1794746792312019,
 19.306977288832496,
 71.96856730011514,
 268.26957952797216,
 1000.0]

Plot the score for each value of C



In [32]:

    
plt.semilogx(C, test_score, C, train_score)
plt.legend(['test_score', 'train_score'])









    Out[32]:





<matplotlib.legend.Legend at 0x121b05f8>

SVR with rbf

Try subsampling?



In [33]:

    
X_train_scaled.shape









    Out[33]:





(262944L, 16L)



In [38]:

    
idx = np.random.choice(np.arange(len(X_train_scaled)), size=int(len(X_train_scaled)*0.5), replace=False)



In [39]:

    
lm = SVR()



In [40]:

    
lm.fit(X_train_scaled[idx], y_train[idx])









    Out[40]:





SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)



In [41]:

    
lm.score(X_va_scaled, y_va)









    Out[41]:





0.28122217024273011

Learning Curve



In [69]:

    
xLen = len(X_train_scaled)
tSize = [.1, .2, .3, .4, .5]
train_sizes, train_scores, valid_scores = learning_curve(SVR(), X_train_scaled, y_train, train_sizes = tSize, n_jobs = -1, verbose = 3)

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.grid()

plt.title("Validation Curve - SVR")
plt.fill_between(tSize, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="darkorange")
plt.fill_between(tSize, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.1, color="navy")
plt.plot(tSize, train_scores_mean, 'o-', color="darkorange",
         label="Training score")
plt.plot(tSize, valid_scores_mean, 'o-', color="navy",
         label="Cross-validation score")

plt.legend(loc="best")
plt.show









    



[learning_curve] Training set sizes: [17529 35059 52588 70118 87648]






    



[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  7.0min remaining: 19.2min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 20.1min remaining: 10.0min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 42.0min finished






    Out[69]:





<function matplotlib.pyplot.show>

	Unnamed: 0	Year	nameplate_capacity	DATETIME	GROSS LOAD (MW)	ERCOT Load, MW	Total Wind Installed, MW	Total Wind Output, MW	Wind Output, % of Installed	Wind Output, % of Load	1-hr MW change	1-hr % change	Net Load (MW)	Net Load Change (MW)	Month	NG Price ($/mcf)	All coal	Lignite	Subbituminous
0	0	2007	5949.0	2007-01-01 00:00:00	4596.0	30428.0	2790.0	1074.0	38.494624	3.529644	NaN	NaN	29354.0	NaN	1	6.42	25.1475	20.0275	28.115
1	1	2007	5949.0	2007-01-01 01:00:00	4566.0	30133.0	2790.0	922.6	33.068100	3.061760	-151.4	-14.096834	29210.4	-143.6	1	6.42	25.1475	20.0275	28.115
2	2	2007	5949.0	2007-01-01 02:00:00	4667.0	29941.0	2790.0	849.2	30.437276	2.836245	-73.4	-7.955777	29091.8	-118.6	1	6.42	25.1475	20.0275	28.115
3	3	2007	5949.0	2007-01-01 03:00:00	4668.0	29949.0	2790.0	1056.3	37.860215	3.526996	207.1	24.387659	28892.7	-199.1	1	6.42	25.1475	20.0275	28.115
4	4	2007	5949.0	2007-01-01 04:00:00	4685.0	30248.0	2790.0	837.1	30.003584	2.767456	-219.2	-20.751680	29410.9	518.2	1	6.42	25.1475	20.0275	28.115

	Unnamed: 0	cluster	Year	nameplate_capacity	DATETIME	GROSS LOAD (MW)	ERCOT Load, MW	Total Wind Installed, MW	Total Wind Output, MW	Wind Output, % of Installed	...	Month	All coal	Lignite	Subbituminous	cluster_5
473329	473329	5	2015	11476.0	2015-12-31 19:00:00	7516.0	39908.77734	16170.0	3824.932373	23.654498	...	12	12.793722	10.780269	14.596413	1.0
473330	473330	5	2015	11476.0	2015-12-31 20:00:00	6552.0	38736.85938	16170.0	4625.632813	28.606264	...	12	12.793722	10.780269	14.596413	1.0
473331	473331	5	2015	11476.0	2015-12-31 21:00:00	5944.0	37587.70313	16170.0	4957.714844	30.659956	...	12	12.793722	10.780269	14.596413	1.0
473332	473332	5	2015	11476.0	2015-12-31 22:00:00	5698.0	36356.26172	16170.0	4699.097656	29.060592	...	12	12.793722	10.780269	14.596413	1.0
473333	473333	5	2015	11476.0	2015-12-31 23:00:00	5365.0	35150.33984	16170.0	4313.125000	26.673624	...	12	12.793722	10.780269	14.596413	1.0

	Gen Change (MW)	y_pr
0	0.0	-56.223127
1	1.0	-15.262189
2	-1.0	-12.223127
3	0.0	-9.316877
4	0.0	25.909686